In [1]:
import pandas as pd

# Load the dataset
df = pd.read_csv('AI in Healthcare Ethiopia (Responses) - Form Responses 1.csv')

# Quick look at the first few rows
df.head()
Out[1]:
Timestamp Age Sex Marital Status Which Region are currently working in? Level of Education Monthly Income (in Birr) Professional Role How many years of experience do you have in your fied? Artificial Intelligence (AI) is like a smart software or a tool that can do tasks like a human. ... AI might replace human doctors in the future. Benefits outweigh risks; improvements are worth the risks. Doctors and healthcare students should learn about AI. AI in healthcare needs regulation and responsible use. More research is needed to understand AI’s impact. I use AI tools in my healthcare profession only if they are validated I’m confident and comfortable using AI tools. I verify AI outputs before implementing them. I trust AI results and recommend them to colleagues I keep up with AI developments in healthcare.
0 3/31/2024 10:52:36 32 Female Single Addis Ababa Bachelor Degree 10000 Physician / Doctor / Surgeon 0 TRUE ... Disagree Agree Strongly Agree Strongly Agree Strongly Agree Never Never Never Never Always
1 3/31/2024 11:18:57 26 Male Single Addis Ababa Bachelor Degree 200 Physician / Doctor / Surgeon 2 TRUE ... Disagree Agree Agree Strongly Agree Strongly Agree Always Always Always Always Always
2 3/31/2024 11:30:48 30 Male Married Gambella Bachelor Degree 8018 Public Health Officer 5 TRUE ... Strongly Agree Agree Agree Agree Strongly Agree Always Always Always Always Always
3 3/31/2024 12:09:09 26 Male Single SNNPR Postgraduate Degree (Master’s, Professional, o... 8000 Public Health Officer 3 TRUE ... Disagree Strongly Agree Strongly Agree Strongly Agree Strongly Agree Always Always Always Always Always
4 3/31/2024 12:12:13 24 Female Single Amhara Postgraduate Degree (Master’s, Professional, o... 9000 Biostatistician 1 TRUE ... Neutral Strongly Agree Strongly Agree Strongly Agree Strongly Agree Always Always Never Never Always

5 rows × 30 columns

In [3]:
# Define a dictionary of {Old Name : New Name}
new_names = {
    'Marital Status': 'Marital_Status',
    'Which Region are currently working in?': 'Region',
    'Level of Education': 'Education_Level',
    'Monthly Income (in Birr)': 'Income',
    'Professional Role': 'Role',
    'How many years of experience do you have in your fied?': 'Experience_Years',

    # Knowledge Questions
    'Artificial Intelligence (AI) is like a smart software or a tool that can do tasks like a human.': 'Knw_AI_Definition',
    'AI learns from data it’s trained with and can understand everyday language to handle complex tasks.': 'Knw_AI_Learning',
    'AI assists healthcare professionals by enhancing diagnosis, treatment, research, education, and management.': 'Knw_AI_Assist',
    'AI helps diagnose diseases by analyzing images, symptoms, test results, and other data.': 'Knw_AI_Diagnosis',
    'AI suggests optimal medicines and dosages for treatment.': 'Knw_AI_Treatment',
    'AI is used in radiology, pathology, surgery, pharmacy, research, and public health.': 'Knw_AI_Usage',
    'Limitations include data quality, privacy, security, bias, errors, and ethical issues.': 'Knw_AI_Limitations',
    'Ethical concerns involve informed consent, accountability, responsibility, and transparency.': 'Knw_AI_Ethics',

    # Attitude Questions
    'AI can solve complex problems and improve health outcomes.': 'Att_Solve_Problems',
    'AI enhances accessibility, especially for remote areas.': 'Att_Accessibility',
    'AI reduces workload for healthcare professionals and optimizes resources.': 'Att_Workload',
    'AI might replace human doctors in the future.': 'Att_Replace_Doctors',
    'Benefits outweigh risks; improvements are worth the risks.': 'Att_Benefits_Risks',
    'Doctors and healthcare students should learn about AI.': 'Att_Learn_AI',
    'AI in healthcare needs regulation and responsible use.': 'Att_Regulation',
    'More research is needed to understand AI’s impact.': 'Att_Research_Need',

    # Practice Questions
    'I use AI tools in my healthcare profession only if they are validated': 'Prac_Use_Validated',
    'I’m confident and comfortable using AI tools.': 'Prac_Confidence',
    'I verify AI outputs before implementing them.': 'Prac_Verify_Output',
    'I trust AI results and recommend them to colleagues': 'Prac_Trust_Recommend',
    'I keep up with AI developments in healthcare.': 'Prac_Keep_Updated'
}

# Rename the columns
df = df.rename(columns=new_names)
In [4]:
def clean_role(text):
    if not isinstance(text, str):
        return "Other"
    
    text = text.lower().strip()
    
    # Grouping Logic
    if 'nurse' in text or 'midwife' in text:
        return 'Nurse/Midwife'
    elif 'doctor' in text or 'physician' in text or 'surgeon' in text or 'gp' in text or 'intern' in text:
        return 'Medical Doctor'
    elif 'public health' in text or 'epidemiologist' in text or 'environment' in text:
        return 'Public Health'
    elif 'pharm' in text:
        return 'Pharmacist'
    elif 'lab' in text or 'microbiolog' in text or 'biomed' in text:
        return 'Lab/Biomedical'
    elif 'lecturer' in text or 'teacher' in text or 'lecurer' in text: # Fix typo 'lecurer'
        return 'Academic/Lecturer'
    elif 'data' in text or 'statistic' in text or 'informatic' in text:
        return 'Data/IT/Stats'
    elif 'anesthe' in text or 'ansthetist' in text: # Fix typo 'ansthetist'
        return 'Anesthetist'
    elif 'student' in text:
        return 'Student'
    else:
        return 'Other'

# Apply the function
df['Role'] = df['Role'].apply(clean_role)
In [5]:
def clean_education(text):
    if not isinstance(text, str):
        return "Unknown"
    
    if 'Bachelor' in text:
        return 'Bachelor'
    elif 'Master' in text or 'Postgraduate' in text:
        return 'Masters/Postgrad'
    elif 'Doctoral' in text or 'PhD' in text:
        return 'PhD'
    else:
        return 'Other'

df['Education_Level'] = df['Education_Level'].apply(clean_education)
In [6]:
def clean_marital(text):
    if not isinstance(text, str):
        return "Unknown"
    return text.split(' / ')[0] # Simplifies "Divorced / Widowed" to just "Divorced" or similar if needed

df['Marital_Status'] = df['Marital_Status'].apply(clean_marital)
In [7]:
import pandas as pd

# 1. Define the cleaning function
def clean_region_v2(text):
    if not isinstance(text, str):
        return "Unknown"
    
    # Convert to lowercase and strip spaces for easier matching
    t = text.lower().strip()
    
    # --- REGION MATCHING LOGIC ---
    
    # 1. Addis Ababa (Handles: "A.A", "Addis", "A̲. A̲", "Finfinne")
    if any(x in t for x in ['addis', 'adis', 'aa', 'a.a', 'finfinne', 'a̲']):
        return 'Addis Ababa'
    
    # 2. Oromia (Handles: "Jimma", "Shoa", "Hareghe", "Meta welabu", "Limmu")
    if any(x in t for x in ['oromia', 'oromiya', 'jimma', 'shoa', 'hareghe', 'meta welabu', 'limmu']):
        return 'Oromia'
    
    # 3. Amhara
    if 'amhara' in t:
        return 'Amhara'
    
    # 4. Sidama (Handles: "Sidaamu", "Sidam")
    if any(x in t for x in ['sidama', 'sidaamu', 'sidam']):
        return 'Sidama'
    
    # 5. Southern / Central / South West 
    # A. South West Ethiopia 
    # (Must check this BEFORE "South Ethiopia" to catch 'South West' correctly)
    if any(x in t for x in ['swe', 'south west']):
        return 'South West Ethiopia'
    
    # B. Central Ethiopia
    if any(x in t for x in ['central', 'centeral', 'gurage', 'werabe']):
        return 'Central Ethiopia'
        
    # C. South Ethiopia
    if any(x in t for x in ['snnpr', 'snne', 'southern', 'south', 'debub', 'amu']):
        return 'South Ethiopia'
    # 6. Somali (Handles: "Jijiga")
    if any(x in t for x in ['somali', 'somale', 'jijiga']):
        return 'Somali'
    
    # 7. Afar
    if 'afar' in t:
        return 'Afar'
    
    # 8. Harari
    if 'harar' in t:
        return 'Harari'
    
    # 9. Dire Dawa
    if 'dire' in t:
        return 'Dire Dawa'
    
    # 10. Gambella
    if 'gambella' in t:
        return 'Gambella'
    
    # 11. Benishangul Gumuz
    if 'benishangul' in t or 'beneshagul' in t or 'gumuz' in t:
        return 'Benishangul Gumuz'
    
    # 12. Tigray
    if 'tigray' in t:
        return 'Tigray'

    # --- EXCLUSION LOGIC ---
    # If it didn't match a region above, check if it's explicitly invalid data
    if any(x in t for x in ['student', 'job', 'unemployed', 'orthodox', 'muslim', 
                            'protestant', 'evangelical', 'islam', 'sudan', 'afghanistan', 
                            '04', 'east africa', 'graduate', 'nursing']):
        return 'Other/Invalid'
    
    # Default for anything else not caught
    return 'Other/Invalid'

# 2. Apply the function
df['Region'] = df['Region'].apply(clean_region_v2)

# 3. Verify the results
print(df['Region'].value_counts())
Region
Addis Ababa            166
Oromia                 113
Other/Invalid           28
Sidama                  27
Amhara                  24
Central Ethiopia        23
South Ethiopia          18
Somali                  10
South West Ethiopia      8
Harari                   8
Afar                     8
Tigray                   6
Dire Dawa                4
Gambella                 3
Benishangul Gumuz        3
Name: count, dtype: int64
In [8]:
# Convert all roles to Title Case (e.g., "public health" -> "Public Health")
df['Role'] = df['Role'].str.title()
In [9]:
# Convert 'Region' to Title Case (e.g., "addis ababa" -> "Addis Ababa")
df['Region'] = df['Region'].str.title()
In [10]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])
In [11]:
df.to_csv('AI_Healthcare_Cleaned.csv', index=False)
In [19]:
import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd

# 1. Load Your Data
df = pd.read_csv('AI_Healthcare_Cleaned.csv')

# 2. Calculate "Adoption Rate" by Region
# We use .agg() to avoid the warning and calculate the % of High Usage
region_stats = df.groupby('Region').agg(
    Adoption_Rate=('Prac_Use_Validated', lambda x: x.isin(['Often', 'Always']).mean() * 100),
    Respondent_Count=('Region', 'count')
).reset_index()

# 3. Load the Shapefile (The Map Borders)
# We specify 'layer="eth_admin1"' to get Regions, not the whole country
shapefile_path = "eth_admin_boundaries.shp" 
gdf = gpd.read_file(shapefile_path, layer="eth_admin1")

# 4. Fix Spelling Mismatches (Crucial Step!)
# This maps your Survey Names (Left) to the Map File Names (Right)
name_fix = {
    'Benishangul Gumuz': 'Benishangul Gumz',
    'Gambella': 'Gambela',
    'Central Ethiopia': 'SNNP', # Mapping new regions to the old SNNP shape
    'South Ethiopia': 'SNNP',
    'South West Ethiopia': 'SNNP' # Often part of SNNP in older maps, or check if map has it
}
# Apply the fix
region_stats['Region_Map_Name'] = region_stats['Region'].replace(name_fix)

# 5. Merge Data
# We group by the NEW map name to handle the SNNP merge (combining South + Central)
final_stats = region_stats.groupby('Region_Map_Name')['Adoption_Rate'].mean().reset_index()

# Join with the shapefile
merged = gdf.set_index('adm1_name').join(final_stats.set_index('Region_Map_Name'))

# 6. Plot the Map
fig, ax = plt.subplots(1, 1, figsize=(12, 12))
merged.plot(column='Adoption_Rate', 
            cmap='OrRd',      # Orange to Red color scheme
            linewidth=0.8, 
            ax=ax, 
            edgecolor='0.8', 
            legend=True,
            legend_kwds={'label': "AI Adoption Rate (%)", 'orientation': "horizontal"},
            missing_kwds={'color': 'lightgrey', 'label': 'No Data'})

ax.axis('off')
ax.set_title('AI Adoption Intensity by Region in Ethiopia', fontdict={'fontsize': '15', 'fontweight' : '3'})
plt.show()
No description has been provided for this image
In [20]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Set visual style for professional reports
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (10, 6)

# Load the data (Make sure the file is in the same folder)
df = pd.read_csv('AI_Healthcare_Cleaned.csv')

# Quick check
print(f"Total Respondents: {len(df)}")
df.head(3)
Total Respondents: 449
Out[20]:
Timestamp Age Sex Marital_Status Region Education_Level Income Role Experience_Years Knw_AI_Definition ... Att_Replace_Doctors Att_Benefits_Risks Att_Learn_AI Att_Regulation Att_Research_Need Prac_Use_Validated Prac_Confidence Prac_Verify_Output Prac_Trust_Recommend Prac_Keep_Updated
0 2024-03-31 10:52:36 32 Female Single Addis Ababa Bachelor 10000 Medical Doctor 0 TRUE ... Disagree Agree Strongly Agree Strongly Agree Strongly Agree Never Never Never Never Always
1 2024-03-31 11:18:57 26 Male Single Addis Ababa Bachelor 200 Medical Doctor 2 TRUE ... Disagree Agree Agree Strongly Agree Strongly Agree Always Always Always Always Always
2 2024-03-31 11:30:48 30 Male Married Gambella Bachelor 8018 Public Health 5 TRUE ... Strongly Agree Agree Agree Agree Strongly Agree Always Always Always Always Always

3 rows × 30 columns

In [23]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Create a dashboard for demographics
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# 1. Professional Roles (Top 5)
top_roles = df['Role'].value_counts().nlargest(5)

# --- THE FIX IS HERE ---
# We added 'hue=top_roles.index' and 'legend=False'
sns.barplot(
    x=top_roles.values, 
    y=top_roles.index, 
    hue=top_roles.index,  # Explicitly tell it to color based on the Role name
    legend=False,         # Hide the legend (redundant since y-axis has labels)
    ax=axes[0], 
    palette='viridis'
)
axes[0].set_title('Top 5 Professional Roles')
axes[0].set_xlabel('Number of Respondents')

# 2. Experience Distribution
# Ensure experience is numeric
df['Experience_Years'] = pd.to_numeric(df['Experience_Years'], errors='coerce')
sns.histplot(df['Experience_Years'].dropna(), bins=15, kde=True, ax=axes[1], color='#3498db')
axes[1].set_title('Years of Experience Distribution')
axes[1].set_xlabel('Years')

plt.tight_layout()
plt.show()
No description has been provided for this image
In [24]:
# Cell 3: Attitude Analysis (The "Fear" vs. "Optimism" Gap)

# Define the order for Likert scales so charts make sense
likert_order = ['Strongly Disagree', 'Disagree', 'Neutral', 'Agree', 'Strongly Agree']

# Create a comparison plot
fig, ax = plt.subplots(1, 2, figsize=(16, 6))

# Plot 1: Will AI replace doctors?
sns.countplot(
    x='Att_Replace_Doctors', 
    hue='Att_Replace_Doctors',  # FIX 1: Explicitly link color to the x-variable
    data=df, 
    order=likert_order, 
    ax=ax[0], 
    palette='RdYlBu',
    legend=False                # FIX 2: Turn off the legend since we have labels
)
ax[0].set_title('Statement: "AI will replace human doctors"')
ax[0].tick_params(axis='x', labelrotation=45) # FIX 3: The modern way to rotate labels

# Plot 2: Do benefits outweigh risks?
sns.countplot(
    x='Att_Benefits_Risks', 
    hue='Att_Benefits_Risks',   # FIX 1
    data=df, 
    order=likert_order, 
    ax=ax[1], 
    palette='RdYlGn',
    legend=False                # FIX 2
)
ax[1].set_title('Statement: "Benefits outweigh the risks"')
ax[1].tick_params(axis='x', labelrotation=45) # FIX 3

plt.tight_layout()
plt.show()

# Calculate exact percentage for your report text
fear_pct = df['Att_Replace_Doctors'].isin(['Agree', 'Strongly Agree']).mean() * 100
optimism_pct = df['Att_Benefits_Risks'].isin(['Agree', 'Strongly Agree']).mean() * 100
print(f"Fear Factor: {fear_pct:.1f}% believe they might be replaced.")
print(f"Optimism Factor: {optimism_pct:.1f}% believe benefits outweigh risks.")
No description has been provided for this image
Fear Factor: 33.0% believe they might be replaced.
Optimism Factor: 56.3% believe benefits outweigh risks.
In [26]:
# Cell 4: Practice Analysis (Actual Usage)

# Analyze usage frequency
usage_order = ['Never', 'Rarely', 'Sometimes', 'Often', 'Always']

plt.figure(figsize=(10, 6))

# --- THE FIX IS HERE ---
ax = sns.countplot(
    y='Prac_Use_Validated', 
    hue='Prac_Use_Validated',  # Explicitly tell it to color based on usage frequency
    data=df, 
    order=usage_order, 
    palette='magma',
    legend=False               # Hide the legend
)

plt.title('How often do professionals use validated AI tools?')
plt.xlabel('Count')

# Add percentages on the bars
total = len(df)
for p in ax.patches:
    # Only label bars that actually exist
    if p.get_width() > 0:
        percentage = '{:.1f}%'.format(100 * p.get_width()/total)
        x = p.get_x() + p.get_width() + 3
        y = p.get_y() + p.get_height()/2
        ax.annotate(percentage, (x, y), va='center')

plt.show()
No description has been provided for this image
In [27]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load Data
df = pd.read_csv('AI_Healthcare_Cleaned.csv')

# Define a "Modern" Color Palette (Teal, Slate, Coral - very clean)
colors_modern = ['#264653', '#2a9d8f', '#e9c46a', '#f4a261', '#e76f51']
In [28]:
# Create a hierarchy: Region -> Sex -> Usage
fig_sun = px.sunburst(df, 
                      path=['Region', 'Sex', 'Prac_Use_Validated'], 
                      title='<b>Digital Equity Map:</b> AI Adoption by Region & Gender',
                      color_discrete_sequence=px.colors.qualitative.Pastel)

fig_sun.update_layout(margin=dict(t=40, l=0, r=0, b=0), font_family="Arial")
fig_sun.show()
In [29]:
# Prepare Data: Group by Role to see market segments
# Calculate average Knowledge Score (0-8) and Trust %
# Convert Trust/Knowledge to numeric for plotting
knw_cols = [c for c in df.columns if c.startswith('Knw_')]
df['Knw_Score'] = df[knw_cols].astype(str).apply(lambda x: x.str.contains('TRUE|True', case=False)).sum(axis=1)

# Calculate Trust Score (Frequency converted to 1-5 scale for average)
freq_map = {'Never': 1, 'Rarely': 2, 'Sometimes': 3, 'Often': 4, 'Always': 5}
df['Trust_Score'] = df['Prac_Trust_Recommend'].map(freq_map)

# Group by Role
role_data = df.groupby('Role').agg(
    Avg_Knowledge=('Knw_Score', 'mean'),
    Avg_Trust=('Trust_Score', 'mean'),
    Count=('Role', 'count')
).reset_index()

# Filter for Roles with at least 5 people (to remove noise)
role_data = role_data[role_data['Count'] > 5]

fig_bubble = px.scatter(role_data, x="Avg_Knowledge", y="Avg_Trust",
                 size="Count", color="Role",
                 hover_name="Role",
                 title="<b>The Market Matrix:</b> Knowledge vs. Trust by Profession",
                 labels={"Avg_Knowledge": "AI Literacy (Score 0-8)", "Avg_Trust": "Trust Level (1-5)"},
                 size_max=60)

# Add a "Sweet Spot" box for investors
fig_bubble.add_shape(type="rect", x0=6, y0=3.5, x1=8, y1=5,
    line=dict(color="Green", width=2, dash="dot"),
)
fig_bubble.add_annotation(x=7, y=4.8, text="Ideally Positioned<br>Market", showarrow=False, font=dict(color="green"))

fig_bubble.update_layout(template="plotly_white")
fig_bubble.show()
In [30]:
# Calculate percentages
reg_agree = df['Att_Regulation'].isin(['Agree', 'Strongly Agree']).mean() * 100
fear_agree = df['Att_Replace_Doctors'].isin(['Agree', 'Strongly Agree']).mean() * 100
risk_agree = df['Att_Benefits_Risks'].isin(['Agree', 'Strongly Agree']).mean() * 100

# Data for Chart
categories = ['Demand for Regulation', 'Belief Benefits > Risks', 'Fear of Job Replacement']
values = [reg_agree, risk_agree, fear_agree]
colors = ['#2a9d8f', '#2a9d8f', '#e76f51'] # Green for good, Red for fear

fig_bar = go.Figure(go.Bar(
            x=values,
            y=categories,
            orientation='h',
            marker_color=colors,
            text=[f"{v:.1f}%" for v in values],
            textposition='auto'
))

fig_bar.update_layout(title_text='<b>Policy Pulse Check:</b> The Regulation vs. Fear Gap',
                      xaxis_title="Percentage of Workforce Agreeing",
                      template="plotly_white")
fig_bar.show()
In [31]:
import plotly.graph_objects as go

# Data Prep: Pivot Gender vs Role for 'Always' usage
heatmap_data = df[df['Prac_Use_Validated'] == 'Always'].groupby(['Role', 'Sex']).size().unstack(fill_value=0)

fig_heat = go.Figure(data=go.Heatmap(
                   z=heatmap_data.values,
                   x=heatmap_data.columns,
                   y=heatmap_data.index,
                   colorscale='Teal',
                   hoverongaps = False))

fig_heat.update_layout(title='<b>Gender Equity Heatmap:</b> Who are the Power Users?',
                       template='plotly_white')
fig_heat.show()
In [32]:
# Create Income Groups for clearer visualization
df['Income_Group'] = pd.qcut(pd.to_numeric(df['Income'], errors='coerce'), 4, labels=['Low', 'Medium', 'High', 'Elite'])

fig_violin = px.violin(df, y="Att_Replace_Doctors", x="Income_Group", color="Income_Group",
                       box=True, points="all",
                       title='<b>The Anxiety Curve:</b> Job Security Fear by Income Level',
                       category_orders={"Att_Replace_Doctors": ["Strongly Disagree", "Disagree", "Neutral", "Agree", "Strongly Agree"]},
                       color_discrete_sequence=px.colors.qualitative.Bold)

fig_violin.update_layout(yaxis_title="Fear Level", showlegend=False)
fig_violin.show()
In [33]:
# Prep Data: Calculate average scores (0-100%) for 4 key metrics
def get_metrics(sub_df):
    return [
        sub_df['Knw_AI_Definition'].apply(lambda x: 1 if str(x).upper()=='TRUE' else 0).mean(),
        sub_df['Prac_Use_Validated'].apply(lambda x: 1 if x in ['Often', 'Always'] else 0).mean(),
        sub_df['Att_Replace_Doctors'].isin(['Agree', 'Strongly Agree']).mean(),
        sub_df['Att_Regulation'].isin(['Agree', 'Strongly Agree']).mean()
    ]

metrics_addis = get_metrics(df[df['Region'] == 'Addis Ababa'])
metrics_regions = get_metrics(df[df['Region'] != 'Addis Ababa'])

categories = ['Knowledge', 'High Usage', 'Fear of Job Loss', 'Demand for Regulation']

fig_radar = go.Figure()

fig_radar.add_trace(go.Scatterpolar(
      r=metrics_addis,
      theta=categories,
      fill='toself',
      name='Addis Ababa (Capital)'
))
fig_radar.add_trace(go.Scatterpolar(
      r=metrics_regions,
      theta=categories,
      fill='toself',
      name='Regional Ethiopia'
))

fig_radar.update_layout(
  polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
  title='<b>The Digital Divide Radar:</b> Capital vs. Regions',
  showlegend=True
)
fig_radar.show()
In [ ]: